#! python3
# phageDisplaySeqAnalysis.py - Analyse University of Guelph's AAC Genomics Facility Sanger sequencing output files. Renames, reorganises, converts, trims, and aligns .seq/.ab1 files. Final aa and nt alignments are in fasta, clustal, and xlsx formats. Sequences that cannot be trimmed at the 5' end are excluded from the multiple sequence alignment and moved to a separate folder for manual analysis.

####################################
#    Preamble
####################################

# Usage notes:
# * All final alignments exclude sequences that weren't full length and those that have premature stop codons.
# * Final alignments will include amino acid/nucleotide sequences with Xs and Ns, respectively.
# * Sequences that aren't full length and include premature stop codons are, however, included in batch files, but never the final alignment.

# Compatibility notes:
# * Must use Biopython 1.77 and no later version. If using a more current version, change alignment codes from "alignment.format(format_spec)" to "format(alignment, format_spec".
# * If using Spyder as your IDE, use a version that isn't version 5. This version for some reason has conflicts with the xlsxwriter package and won't get past importing modules.
# * This code is confirmed to work with python 3.8.8. Later versions may work but have not been verified.
# * Confirmed to work in Windows, unconfirmed in Macs and Linux but should work in theory (may need to change lines regarding path names so that the format matches the OS, currently these are optimised for Windows' format).

# To do:
# * Add work-arounds for when code breaks; have it bypass the issue and finish analysis, leaving the user to manually analyse problem sequences.
# * To facilitate the above point, add functions that do nt-aa conversion, trimming, etc. outside of the program.
# * Add custom warnings for common errors that break script, give suggestions as to what went wrong.

####################################
#    Modules
####################################

import os, glob, re, shutil, logging, xlsxwriter
from shutil import copyfile
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio import AlignIO
from collections import Counter, OrderedDict

####################################
#    Classes
####################################

# Ordered list of counts.
class OrderedCounter(Counter, OrderedDict):
    pass

####################################
#    Code
####################################

##################
# Colours for print functions.
##################

cyan = lambda text: '\033[0;36m' + text + '\033[0m'
green = lambda text: '\033[0;32m' + text + '\033[0m'

##################
# Setup
##################

# Choose data source and change working directory.
print(cyan('\nScript started. This will process AAC Genomics Facility (University of Guelph) output by renaming, converting, trimming, and aligning .seq files.') +
      cyan('\n\nEnter folder location/path where .seq files are located:'))
path = input()
path = path.replace('\\', '/')
os.chdir(path)

# Logging setup.
pathRegex = re.compile(r'([\w.]+)$')
locList = pathRegex.findall(path)
locStr = str(locList[0])
logging.basicConfig(filename = path + "/" + locStr + ".log",
                    level = logging.INFO,
                    format = '%(asctime)s - %(message)s',
                    filemode = 'w')
logging.info('Working directory changed to %s.' % (path))

##################
# Data Analysis
##################

# Create rawSeq folder.
if not os.path.exists("./01. rawSeq"):
    os.makedirs('01. rawSeq')
    renameLoc = path + '/01. rawSeq'
    logging.info('%s directory created.' % (renameLoc))
# Rename each raw .seq file to remove well designations used by AAC, add the number 0 to names that have single numbers in them (facilitates downstreeam organization), and move to rawSeq folder.
wellRegex = re.compile(r'([_][A-H]\d\d[.])')
for fileName in glob.glob('*.seq'):
    shortName = wellRegex.sub('.', fileName)
    newName = re.sub(r'(^\w{1}[_]\w{1})(\d{1}[_]\s*\w*)', r'\g<1>0\g<2>', shortName)
    oldName = path + '/' + fileName
    newName = path + '/01. rawSeq/' + newName
    shutil.move(oldName, newName)
    renameLoc = path + '/01. rawSeq'
    logging.info('Sequence renamed from "%s" "%s" and moved to %s.' % (fileName, shortName, renameLoc))

# Create rawAb1 folder.
if not os.path.exists("./02. rawAb1"):
    os.makedirs('02. rawAb1')
    renameLoc = path + '/02. rawAb1'
    logging.info('%s directory created.' % (renameLoc))
# Rename each raw .ab1 file to remove well designations used by AAC, add the number 0 to names that have single numbers in them (facilitates downstreeam organization), and move to rawAb1 folder.
for fileName in glob.glob('*.ab1'):
    shortName = wellRegex.sub('.', fileName)
    newName = re.sub(r'(^\w{1}[_]\w{1})(\d{1}[_]\s*\w*)', r'\g<1>0\g<2>', shortName)
    oldName = path + '/' + fileName
    newName = path + '/02. rawAb1/' + newName
    shutil.move(oldName, newName)
    renameLoc = path + '/02. rawAb1'
    logging.info('Chromatogram renamed from "%s" "%s" and moved to %s.' % (fileName, shortName, renameLoc))
print(green('\nRenaming finished.'))
    
# Convert files from .seq to .fasta.
os.chdir("./01. rawSeq")
for fileName in glob.glob('*.seq'):
    with open(fileName, 'r') as file:
        seq = file.read().replace('\n', '')
        name = fileName.replace('.seq', '')
        seq = ">" + name + "\n" + seq + "\n"
# Write output fasta file.
        output = name + '.fasta'
        with open(output, "w") as newFile:
            newFile.write(seq)
            convertLoc = path + '/01. rawSeq'
            logging.info('Sequence converted from "%s" to "%s" and moved to %s.' % (fileName, output, convertLoc))
        file.close()
print(green('\n.seq to .fasta conversion finished.'))

# Create folder for .fasta files.
if not os.path.exists("../03. ntFasta"):
    os.chdir(path)
    os.makedirs('03. ntFasta')
    fastaLoc = path + '/03. ntFasta'
    logging.info('%s directory created.' % (fastaLoc))
    os.chdir('./01. rawSeq/')
    
# Move fasta files to ntFasta folder.
for fileName in glob.glob('*.fasta'):
    oldPath = path + '/01. rawSeq/' + fileName
    newPath = path + '/03. ntFasta/' + fileName
    shutil.move(oldPath, newPath)
    fastaLoc = path + '/03. ntFasta'
    logging.info('"%s" moved to %s.' % (fileName, fastaLoc))

# Append all .fasta files together into a single batch file.
os.chdir('../03. ntFasta')
name1 = locStr
name2 = locStr + '.fasta'
fileNames = glob.glob('*.fasta')
with open(name2, 'w') as f:
    for file in fileNames:
        with open(file) as inFile:
            f.write(inFile.read() + '\n')
            logging.info('"%s" written to "%s".' % (file, name2))
            inFile.close()
print(green('\nBatch file ("%s") created.' % (name2)))
logging.info('Batch file ("%s") created.' % (name2))
       
# Trim sequences at 5' end by sequence.
print (cyan('''\nEnter sequence to begin 5' trim at (not case-sensitive):
E.g. For UbVs, use AAAATG (last FLAG residue + start codon).
Otherwise use AUG start codon plus a few conserved nucleotides beyond it.
         
Note: Using conserved nucleotides is a must or else sequences will be trimmed at multiple sites, producing different size products and preventing alignment.'''))
start = input()
start = start.upper()
fileList = glob.glob('*.fasta')
for fileName in fileList:
    os.chdir(path + '/03. ntFasta')
    index = 0
    with open(fileName, 'r') as file:
        lines = file.readlines()
        seq = str(lines[1])
        index = seq.find(start, index)
        if index != -1:
            trimmedSeq = seq[index:]
            newName = fileName.replace('.fasta', '_ntTrimmed.fasta')
            with open(newName, "w") as newFile:
                newFasta = newName.replace('.fasta', '')
                newFile.write('>' + newFasta + '\n' + trimmedSeq)
                logging.info('%s found in "%s" at %d. Trimmed sequence exported as "%s".' % (start, fileName, index, newName))
            index += 1
# Move untrimmed sequences to noTrim folder.
        elif index == -1:
            if not os.path.exists(path + "/noTrim"):
                os.chdir(path)
                os.makedirs('noTrim')
                newPath = path + '/noTrim'
                logging.info('%s directory created.' % (newPath))
            noTrim = path + "/noTrim"
            os.chdir(noTrim)
            newName = fileName.replace('.fasta', '_5trimfail.fasta')
            with open(newName, "w") as newFile:
                newFasta = newName.replace('.fasta', '')
                newFile.write('>' + newFasta + '\n' + seq)
                logging.info('%s not found in "%s". Sequence exported as "%s" to %s.' % (start, fileName, newName, noTrim))
                newFile.close()
        file.close()
print(green('''\n5' trim finished.'''))

# Create ntTrimmed folder.
if not os.path.exists("../04. ntTrimmed"):
    os.chdir(path)
    os.makedirs('04. ntTrimmed')
    trimLoc = path + '/04. ntTrimmed'
    logging.info('%s directory created.' % (trimLoc))            

# Trim sequences at 3' end by total length.
print (cyan('''\nEnter the number of nucleotides downstream of the 5' trim site to trim at:
E.g. For UbVs, use 237 nt.'''))
end = input()
end = int(end)
os.chdir(path + '/03. ntFasta')
fileList = glob.glob('*trimmed.fasta')
for fileName in fileList:
    os.chdir(path + '/03. ntFasta')
    with open(fileName, 'r') as file:
        lines = file.readlines()
        fastaName = str(lines[0])
        seq = str(lines[1])
        finalTrimmedSeq = seq[0:end]
        file.close()
# Move untrimmed sequences to noTrim folder.
        if len(finalTrimmedSeq) < end:
            if not os.path.exists(path + "/noTrim"):
                os.chdir(path)
                os.makedirs(path + '/noTrim')
                newPath = path + '/noTrim'
                logging.info('%s directory created.' % (newPath))
            os.chdir(path + '/noTrim')
            newName = fileName.replace('_ntTrimmed.fasta', '_3trimfail.fasta')
            with open(newName, "w") as newFile:
                newFasta = newName.replace('.fasta', '')
                newFile.write('>' + newFasta + '\n' + seq)
                newFile.close()
                logging.info('Sequence trimmed %s nt downstream from %s. Trimmed sequence exported as "%s".' % (end, start, fileName))
        else:
            with open(fileName, 'r') as file:
                lines = file.readlines()
                fastaName = str(lines[0])
                seq = str(lines[1])
                finalTrimmedSeq = seq[0:end]
                logging.info('%s trimmed %s nt downstream.' % (file, end))
                with open(fileName, "w") as newFile:
                    newFile.write(fastaName + finalTrimmedSeq)
                    newFile.close()
                oldPath = path + '/03. ntFasta/'  + fileName
                newPath = path + '/04. ntTrimmed/' + fileName
                file.close()
                shutil.move(oldPath, newPath)
                logging.info('"%s" moved to %s.' % (fileName, newPath))
                file.close()
print(green('''\n3' trim finished.'''))
    
# Clean up unnecessary/faulty trimmed files.
os.chdir(path + '/04. ntTrimmed')
name3 = name1 + '_ntTrimmed.fasta'
if os.path.exists(name3) == True:
    os.remove(name3)
    logging.info('Deleted %s (faulty batch file).' % (name3))
else:
    pass

# Append all trimmed nucleotide fasta files together into a single batch file.
if os.path.exists(path + '/03. ntFasta/' + name2):
    if os.path.exists(path + "/04. ntTrimmed"):
        fileNames = glob.glob('*trimmed.fasta')
        with open(name3, 'w') as newFile:
            for file in fileNames:
                with open(file) as infile:
                    newFile.write(infile.read() + '\n\n')
                    logging.info('"%s" written to "%s".' % (file, name3))
            newFile.close()
        print(green('\nTrimmed nucleotide batch file exported as "%s".' % (name3)))
        logging.info('Trimmed nucleotide batch file exported as "%s".' % (name3))

# Create 'aaTrimmed' folder.
os.chdir(path + "/04. ntTrimmed")
transLoc = path + "/04. ntTrimmed"
logging.info('Moved to %s.' % (transLoc))
os.makedirs(path + '/05. aaTrimmed')
aaLoc = path + '/05. aaTrimmed'
logging.info('%s created.' % (aaLoc))

# Make copies of ntTrimmed files in aaTrimmed folder with new names.
ntTrimRegex = re.compile(r'(_ntTrimmed)')
aaTrimRegex = re.compile(r'(_aaTrimmed)')
fileList = glob.glob('*.fasta')
for fileName in fileList:
    oldPath = transLoc + '/' + fileName
    fileName2 = ntTrimRegex.sub('_aaTrimmed', fileName)
    newPath = aaLoc + '/' + fileName2
    copyfile(oldPath, newPath)
    logging.info('%s moved from %s to %s.' % (fileName2, oldPath, newPath))

# Translate trimmed nucleotides to amino acids.
os.chdir(aaLoc)
fileList = glob.glob('*.fasta')
for fileName in fileList:
    with open(fileName, "r+") as newFile:
# Extract data.
        lines = newFile.readlines()
        fastaName = lines[0]
        fastaName = ntTrimRegex.sub('_aaTrimmed', fastaName)
        ntSeq = lines[1]
# Remove previous file contents
        newFile.seek(0)
        newFile.truncate()
# Translate.
        gene = Seq(ntSeq, generic_dna)
        try:
            aaSeq = gene.translate(table = 'Standard', stop_symbol = '*')
            aaSeq = str(aaSeq)
# Write new contents to file.            
            newFile.write(fastaName + aaSeq)
            logging.info('"%s" translated.' % (fileName))
        except:
            logging.info('"%s" could not be translated.' % (fileName))
            pass
        newFile.close()
print(green('''\nTranslation finished.'''))
        
# Clean up faulty amino acid trimmed file.
name6 = ntTrimRegex.sub('_aaTrimmed', name3)
os.remove(name6)
logging.info('Deleted %s (faulty batch file).' % (name6))

# Append all trimmed amino acid fasta files together into a single batch file.
fileNames = glob.glob('*.fasta')
with open(name6, 'w') as f:
    for file in fileNames:
        with open(file) as inFile:
            f.write(inFile.read() + '\n')
            logging.info('%s written to %s.' % (file, name6))
            inFile.close()
    f.close()
print(green('\nTranslated batch file (%s) created.' % (name6)))
logging.info('Translated batch file (%s) created.' % (name6))

# Remove truncated amino acid sequences to prepare for alignment.
os.chdir(path + '/05. aaTrimmed')
with open(name6, "r") as batchFile:
    batchData = batchFile.readlines()
    batchFile.close()
dataRegex = re.compile('^\w+')
batchList = []
batchList2 = []
# Separate amino acid sequences from names in batch file.
for entry in batchData:
    if re.match(dataRegex, entry):
        batchList += [entry]
    else:
        batchList2 += [entry]
dataList = [entry[:-1] for entry in batchList]
nameList = [entry[:-1] for entry in batchList2]
# Remove blank entries in list, if present.
nameList = list(filter(None, nameList))
# Remove sequences that aren't full length.
listLen = [len(entry) for entry in dataList]
maxLen = max(listLen)
aaAlignList = []
for entry in dataList:
    if len(entry) == maxLen: 
        aaAlignList.append(entry)
    elif len(entry) < maxLen:
        logging.info('%s removed from pool for alignment.' % (entry))
        pass
aaAlignDict = dict(zip(nameList, aaAlignList))
# Remove sequences with at least one stop codon.
with open(name6, "w") as newFile:
    for key, value in aaAlignDict.items():
        if value.find('*') == -1:
            newFile.write(key + '\n' + value + '\n')
        else:
            pass
    newFile.close()
 
##################
# Alignment.
##################

# Create alignment folder. 
if not os.path.exists(path + "/06. alignment"):
    os.chdir(path)
    os.makedirs('06. alignment')
    alignLoc = path + '/06. alignment'
    logging.info('%s directory created.' % (alignLoc))
os.chdir(path + '/06. alignment')

# Create amino acid multiple alignments.
alignment = AlignIO.read(open(path + '/05. aaTrimmed/' + name6), "fasta")
alignLen = alignment.get_alignment_length()
fastaAlign = alignment.format("fasta")
clustalAlign = alignment.format("clustal")
name4 = name6.replace('.fasta','_aligned.fasta')
with open(name4, "w") as newFile:
    newFile.write(fastaAlign)
    print(green('\nAmino acid fasta alignment created.'))
    logging.info('Amino acid fasta alignment (%s) created.' % (name4))
    newFile.close()
name5 = name6.replace('.fasta','_aligned.clustal')
with open(name5, "w") as newFile:
    newFile.write(clustalAlign)
    print(green('\nAmino acid clustal alignment (%s) created.'))
    logging.info('Amino acid clustal alignment (%s) created.' % (name5))
    newFile.close()
    
# Create nucleotide multiple alignments.
alignment2 = AlignIO.read(open(path + '/04. ntTrimmed/' + name3), "fasta")
alignLen2 = alignment2.get_alignment_length()
fastaAlign2 = alignment2.format("fasta")
clustalAlign2 = alignment2.format("clustal")
name4 = name3.replace('.fasta','_aligned.fasta')
with open(name4, "w") as newFile:
    newFile.write(fastaAlign2)
    print(green('\nNucleotide fasta alignment created.'))
    logging.info('Nucleotide fasta alignment (%s") created.' % (name4))
    newFile.close()
name5 = name3.replace('.fasta','_aligned.clustal')
with open(name5, "w") as newFile:
    newFile.write(clustalAlign2)
    print(green('\nNucleotide clustal alignment created.'))
    logging.info('Nucleotide clustal alignment (%s) created.' % (name5))
    newFile.close()

##################
# Export as .xlsx.
##################

# Create workbook.
workbook = xlsxwriter.Workbook(path + '/06. alignment/' + locStr + '_alignment.xlsx')
logging.info('Excel spreadsheet created as "%s.xlsx".' % (locStr))

# Cell formatting rules. 
# General.
general_format = workbook.add_format()
general_format.set_align('center')
general_format.set_align('vcenter')
# Titles.
title_format = workbook.add_format({'bold': True, 'font_size': 12})
title_format.set_align('center')
# Statistics.
stats_format = workbook.add_format({'num_format': '#,##0.0'})
stats_format.set_align('center')
stats_format.set_align('vcenter')
# Wells.
wellList_format = workbook.add_format({'font_size': 11})
wellID_format = workbook.add_format({'font_size': 12})
wellID_format.set_align('center')
# Residue numbers.
residue_format = workbook.add_format({'font_size': 10})
residue_format.set_align('center')
# Sequences.
sequence_format = workbook.add_format({'font_size': 10})
sequence_format.set_align('center')
sequence_format.set_align('vcenter')
sequence_format.set_font_name('Lucida Console')
logging.info('Cell formatting rules are set.')

##################
# 'All AA Seq' worksheet.
##################

# Create worksheet for all amino acid sequences.
worksheet1 = workbook.add_worksheet('All AA Seq')
worksheet1.hide_gridlines(option = 2)
worksheet1.set_column(0, 0, 20)
worksheet1.set_column(1, alignLen, 2)
worksheet1.freeze_panes(0, 1)
logging.info('All AA Seq worksheet created.')
# Amino acid alignment data extraction.
nameRegex = re.compile(r'>(.*)')
seqRegex = re.compile(r'(?<!>)([A-Z]{2,})(?!\d|[a-z]|_)')
stopRegex = re.compile(r'([*]+[A-Z]*)')
shortNameRegex = re.compile(r'([_][M][\w]*)')
alignFile = locStr + ('_aaTrimmed_aligned.fasta')
with open(alignFile, 'r') as inFile:
    lines1 = inFile.read()
    # Remove '_aaTrimmed' from names.
    linesTrim1 = aaTrimRegex.sub('', lines1)
    # Find names.
    nameListaa = nameRegex.findall(linesTrim1)
    # Remove formatting.
    lines2 = lines1.replace('\n', '')
    # Remove truncated sequences.
    linesTrim2 = stopRegex.sub('', lines2)
    # Find sequences.
    aaList = seqRegex.findall(linesTrim2)
    # Extract only well IDs without other formatting.
    lines1Trim = shortNameRegex.sub('', lines1)
    shortWellListAa = nameRegex.findall(lines1Trim)
    inFile.close()
# Write well IDs.
worksheet1.write(0, 0, 'ID', title_format)
idRow = 2
for name in nameListaa:
    worksheet1.write(idRow, 0, name, wellID_format)
    idRow += 1
    logging.info('%s written to All AA Seq worksheet.' % (name))
# Write all amino acid sequences.
worksheet1.write(0, 6, 'Amino Acid Sequence', title_format)
seqRow = 2
seqCol = 1
for aa in aaList:
    letterList = list(aa)
    for letter in letterList:
        worksheet1.write(seqRow, seqCol, letter, sequence_format)
        seqCol +=1
    seqRow += 1
    seqCol = 1
logging.info('All Amino acid sequences written to All AA Seq worksheet.')
# Write amino acid residue numbers above sequences.
numberList = list(range(1, alignLen + 1))
residueCol = 1
for number in numberList:
    worksheet1.write(1, residueCol, number, residue_format)
    residueCol += 1

##################
# 'Unique AA Seq' worksheet.
##################

# Create worksheet for unique amino acid sequences.
worksheet2 = workbook.add_worksheet('Unique AA Seq')
worksheet2.hide_gridlines(option = 2)
worksheet2.set_column(1, alignLen, 2)
worksheet2.freeze_panes(0, 1)
logging.info('Unique AA Seq worksheet created.')
# Create list of unique amino acid sequences ordered by frequency.
uniqueAa = OrderedCounter(aaList)
uniqueAa = uniqueAa.most_common()
uniqueDictAa = dict(uniqueAa)
# Write unique amino acid sequences.
worksheet2.write(0, 6, 'Amino Acid Sequence', title_format)
seqRow = 2
seqCol = 1
for seq in uniqueDictAa.keys():
    letterList = list(seq)
    for letter in letterList:
        worksheet2.write(seqRow, seqCol, letter, sequence_format)
        seqCol += 1
    seqRow += 1
    seqCol = 1
logging.info('Unique sequences written to Unique AA Seq worksheet.')    
# # Write sequence counts.
count = list(uniqueDictAa.values())
countRow = 2
countCol = alignLen + 1
worksheet2.write(0, alignLen + 1, 'Count', title_format)
for number in count:
    worksheet2.write_number(countRow, countCol, number, general_format)
    countRow += 1
logging.info('Sequence counts written to Unique AA Seq worksheet.')
# Assign arbitrary IDs to each unique amino acid sequence.
worksheet2.write(0, 0, 'ID', title_format)
numberList = list(range(1, len(uniqueAa) + 1))
idRow = 2
for number in numberList:
    worksheet2.write_number(idRow, 0, number, general_format)
    idRow += 1
# Write amino acid residue numbers above sequences.
numberList = list(range(1, alignLen + 1))
residueCol = 1
for number in numberList:
    worksheet2.write(1, residueCol, number, residue_format)
    residueCol += 1

##################
# Ordered values: Amino Acids
##################

# Get ordered list of wells that correspond to unique sequences; necessary for subsequent statistics.
newaaDict = dict(zip(shortWellListAa, aaList))
orderedSeq = []
for key in uniqueDictAa.keys():
    orderedSeq.append(key)
orderedIndex = []
for seq in orderedSeq:
    for key, value in newaaDict.items():
        if seq in value:
            orderedIndex.append(key)
# Associate specific well IDs with corresponding unique sequence (amino acids).
countID = []
begin = 0
for uniqueSeq, count in uniqueDictAa.items():
    end = int(count) + begin
    countID.append(orderedIndex[begin:end])
    begin += count
# Write specific IDs to worksheet.
worksheet2.write(0, alignLen + 2, 'Wells', title_format)
wellRow = 2
wellCol = alignLen + 2
countIDregex = re.compile(r"([A-Z][0-1][0-9])")
sep = ', '
for wellList in countID:
    wellList = countIDregex.findall(str(wellList))
    wellList = sep.join(wellList)
    worksheet2.write(wellRow, wellCol, wellList, wellList_format)
    wellRow += 1

##################
# 'All NT Seq' worksheet.
##################

# Create worksheet for all nucleotide sequences.
worksheet3 = workbook.add_worksheet('All NT Seq')
worksheet3.hide_gridlines(option=2)
logging.info('All NT Seq worksheet created.')
worksheet3.set_column(0, 0, 20)
worksheet3.set_column(1, alignLen2, 3)
worksheet3.freeze_panes(0, 1)
# Nucleotide alignment data extraction.
alignFile2 = locStr + ('_ntTrimmed_aligned.fasta')
with open(alignFile2, 'r') as inFile:
    lines3 = inFile.read()
    # Remove '_ntTrimmed' from names.
    linesTrim3 = ntTrimRegex.sub('', lines3)
    # Find names.
    nameListnt = nameRegex.findall(linesTrim3)
    # Remove formatting.
    linesTrim4 = lines3.replace('\n', '')
    # Find sequences.
    ntList = seqRegex.findall(linesTrim4)
    # Extract only well IDs without other formatting.
    lines3Trim = shortNameRegex.sub('', lines3)
    shortWellListNt = nameRegex.findall(lines3Trim)
    inFile.close()
# Write well IDs.
worksheet3.write(0, 0, 'ID', title_format)  
idRow = 2
for name in nameListnt:
    worksheet3.write(idRow, 0, name, wellID_format)
    idRow += 1
    logging.info('"%s" written to All NT Seq worksheet.' % (name))
# Write all nucleotide sequences.
worksheet3.write(0, 6, 'Nucleotide Sequence', title_format)
seqRow = 2
seqCol = 1
for nt in ntList:
    letterList = list(nt)
    for letter in letterList:
        worksheet3.write(seqRow, seqCol, letter, sequence_format)
        seqCol +=1
    seqRow += 1
    seqCol = 1
logging.info('Nucleotide sequences (broken up) written to allSeq - NT worksheet.')
# Write nucleotide base pair numbers above sequences.
numberList2 = list(range(1, alignLen2 + 1))
bpCol = 1
for number in numberList2:
    worksheet3.write(1, bpCol, number, residue_format)
    bpCol += 1

##################
# 'Unique NT Seq' worksheet.
##################

# Create worksheet for unique nucleotide sequences.
worksheet4 = workbook.add_worksheet('Unique NT Seq')
worksheet4.hide_gridlines(option=2)
worksheet4.set_column(1, alignLen2, 3)
worksheet4.freeze_panes(0, 1)
logging.info('Unique NT Seq worksheet created.')
# Create list of unique nucleotide sequences ordered by frequency.
uniqueNt = OrderedCounter(ntList)
uniqueNt = uniqueNt.most_common()
uniqueDictNt = dict(uniqueNt)
# Write unique nucleotide sequences.
worksheet4.write(0, 6, 'Nucleotide Sequence', title_format)
seqRow = 2
seqCol = 1
for seq in uniqueDictNt.keys():
    letterList = list(seq)
    for letter in letterList:
        worksheet4.write(seqRow, seqCol, letter, sequence_format)
        seqCol += 1
    seqRow += 1
    seqCol = 1
logging.info('Unique sequences written to Unique NT Seq worksheet.')    
# Write sequence counts.
count = list(uniqueDictNt.values())
countRow = 2
countCol = alignLen2 + 1
worksheet4.write(0, alignLen2 + 1, 'Count', title_format)
for number in count:
    worksheet4.write_number(countRow, countCol, number, general_format)
    countRow += 1
logging.info('Sequence counts written to Unique NT Seq worksheet.')
# Assign arbitrary IDs to each unique nucleotide sequence.
worksheet4.write(0, 0, 'ID', title_format)
numberList = list(range(1,len(uniqueNt)+1))
idRow = 1
for number in numberList:
    worksheet4.write_number(idRow, 0, number, general_format)
    idRow += 1
# Write nucleotide base pair numbers above sequences.
bpCol = 1
for number in numberList2:
    worksheet4.write(1, bpCol, number, residue_format)
    bpCol += 1

##################
# Ordered values: Nucleotides
##################

# Get ordered list of wells that correspond to unique sequences; necessary for attributing wells to unique sequences.
newntDict = dict(zip(shortWellListNt, ntList))
orderedSeq2 = []
for key in uniqueDictNt.keys():
    orderedSeq2.append(key)
orderedIndex2 = []
for seq in orderedSeq2:
    for key, value in newntDict.items():
        if seq in value:
            orderedIndex2.append(key)
# Associate specific well IDs with corresponding unique sequence (nucleotides).
countID = []
begin = 0
for uniqueSeq, count in uniqueDictNt.items():
    end = int(count) + begin
    countID.append(orderedIndex2[begin:end])
    begin += count
# Write specific IDs to worksheet.
worksheet4.write(0, alignLen2 + 2, 'Wells', title_format)
wellRow = 2
wellCol = alignLen2 + 2
countIDregex = re.compile(r"([A-Z][0-1][0-9])")
sep = ', '
for wellList in countID:
    wellList = countIDregex.findall(str(wellList))
    wellList = sep.join(wellList)
    worksheet4.write(wellRow, wellCol, wellList, wellList_format)
    wellRow += 1

##################
# Final workbook formatting.
##################

# Transform data into proper Excel-formatted tables without any design style applied.
worksheet1.add_table(1, 0, len(nameListaa) + 1, alignLen, {'header_row': False, 'style': None})
worksheet2.add_table(1, 0, len(uniqueAa) + 1, alignLen + 2, {'header_row': False, 'style': None})
worksheet3.add_table(1, 0, len(nameListnt) + 1, alignLen2, {'header_row': False, 'style': None})
worksheet4.add_table(1, 0, len(uniqueNt) + 1, alignLen2 + 2, {'header_row': False, 'style': None})

# Close .xlsx file.
workbook.close() 

# Conclusion.
print(green('\nExcel alignment exported as %s_aaTrimmed_aligned.xlsx.' % (locStr)))
logging.info('Excel alignment exported as %s_aaTrimmed_aligned.xlsx.' % (locStr))   
print(green('\nAnalysis finished. See log file for details.'))
print(cyan('''\nPost-analysis help:
            
Non-trimmed files are in the "noTrim" folder and couldn't be trimmed because:
A) There's no sequence.
    or
B) The 5' trim site couldn't be found.
        
In the latter case, look at the corresponding ab1 file and assess whether the bases were called correctly or not.

When changing base calls, create a brand new file (do not overwrite the raw file) and type called bases in lowercase to distinguish them from bases called by the machine.'''))
logging.info('phageDisplaySeqAnalysis.py finished running.')

# Shutdown logging.
logging.shutdown()